{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70261773",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "'''\n",
    "##Title: Saginaw County ROA Webscraper and Parser\n",
    "##Author: Arkey Barnett\n",
    "##Date Last Modified: 01/09/2024\n",
    "##Purpose: Scrape and parse Saginaw County ROAs (court history records) to obtain the court history events needed for the scheduling instrument.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33c4c430",
   "metadata": {},
   "outputs": [],
   "source": [
    "################################################################################################################\n",
    "##Importing the Packages\n",
    "################################################################################################################\n",
    "import requests;\n",
    "from lxml import etree;\n",
    "import pandas as pd;\n",
    "import numpy as np;\n",
    "from lxml import html;\n",
    "from lxml.html import fromstring\n",
    "import string;\n",
    "#import requests_file;\n",
    "import time;\n",
    "import glob, os;\n",
    "import re;\n",
    "from datetime import datetime;\n",
    "import math;\n",
    "##Ran the following into my terminal\n",
    "#python -m pip install selenium\n",
    "#python -m pip install webdriver-manager\n",
    "## Location of modules: ./opt/anaconda3/lib/python3.9/site-packages\n",
    "##pip install selenium\n",
    "##pip install webdriver-manager\n",
    "import time; ##allows python to rest between moves…\n",
    "from selenium import webdriver; ##actual webdriver we are going to use\n",
    "from selenium.webdriver.chrome.service import Service; ##best practice way to do it right now\n",
    "from selenium.webdriver.chrome.options import Options;\n",
    "from selenium.webdriver.support.ui import Select; #https://stackoverflow.com/questions/32382415/selenium-python-select\n",
    "from webdriver_manager.chrome import ChromeDriverManager;\n",
    "from selenium.webdriver.common.by import By;\n",
    "from selenium.webdriver.common.keys import Keys;\n",
    "from selenium.webdriver import ActionChains;\n",
    "from selenium.webdriver.support.wait import WebDriverWait; ##https://selenium-python.readthedocs.io/waits.html\n",
    "from selenium.webdriver.support import expected_conditions as EC;\n",
    "from selenium.common.exceptions import WebDriverException #https://stackoverflow.com/questions/26943847/check-whether-element-is-clickable-in-selenium\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4261f811",
   "metadata": {},
   "outputs": [],
   "source": [
    "################################################################################################################\n",
    "##Setting up the Directories\n",
    "################################################################################################################\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "# NOTE(review): machine-specific absolute path; ROOT and everything derived from it is relative to this directory.\n",
    "#os.chdir('/Users/arkeybarnett');\n",
    "os.chdir('/Users/Success')\n",
    "ROOT = Path('Dropbox/IGNITE/2input_data/saginaw_courts')\n",
    "\n",
    "OUT = (ROOT / '3ROAs/pre_covid_crime').resolve()  # folder the scraped ROA html pages are written to\n",
    "Monitor = (ROOT / '5output').resolve()  # folder for the monitoring csv and the parsed spreadsheet\n",
    "case_info = Monitor / 'precovcrime_cases13.csv'\n",
    "source = ROOT / '3ROAs/pre_covid_crime'  # same folder as OUT, read back by the parser\n",
    "Case_numbers = ROOT / '2case_number'\n",
    "\n",
    "##inputting case numbers (one per line in the txt file)\n",
    "with open(Case_numbers / 'saginaw_case_precrim_22dec2023_13.txt', 'r') as txt:\n",
    "    # blank lines are skipped so they are not submitted to the portal as empty case numbers\n",
    "    active = [x.strip() for x in txt if x.strip()]\n",
    "\n",
    "# number of case numbers loaded (shown as the cell output)\n",
    "len(active)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f17a296",
   "metadata": {},
   "outputs": [],
   "source": [
    "################################################################################################################\n",
    "##Webscraping the ROAs\n",
    "################################################################################################################\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7674b70d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "################Empty accumulators; gotopage appends one entry to each per case\n",
    "# id_     -> order of cases scraped\n",
    "# case_id -> case number from inputs\n",
    "# url     -> url of the roa\n",
    "# remain  -> number of case numbers remaining to scrape\n",
    "id_, case_id, url, remain = [], [], [], []\n",
    "\n",
    "################Creating Program to Scrape the ROAs\n",
    "whole = active  # alias of the case-number list loaded above (same list object)\n",
    "def gotopage(case, date_from='01/01/1955', date_to='12/28/2023'):\n",
    "    '''Search the Saginaw County Odyssey portal for one case number and save its ROA page.\n",
    "\n",
    "    Opens a fresh Chrome session, searches by case number, clicks the first View\n",
    "    button, then loads the first link whose markup contains CaseNumber= and writes\n",
    "    that page to OUT/<case>.html. One entry per call is appended to the\n",
    "    module-level lists id_, case_id, url and remain, and progress is printed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    case : str\n",
    "        Case number to search for; must be an element of the module-level list whole.\n",
    "    date_from, date_to : str\n",
    "        Text typed into the portal's DateFrom / DateTo search fields.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If case is not in whole.\n",
    "    selenium.common.exceptions.WebDriverException\n",
    "        If the portal cannot be driven; the browser is closed before it propagates.\n",
    "    '''\n",
    "    new = whole.index(case) + 1  # 1-based position of this case in the input list\n",
    "    left = len(whole) - new  # cases still to scrape after this one\n",
    "    g = 'empty'  # sentinel recorded in url when no ROA link is found (also covers a page with no links at all)\n",
    "\n",
    "    options = Options()\n",
    "    #to auto dev tools which helps with the options list https://stackoverflow.com/questions/59365968/opening-inspect-pressing-f12-on-chrome-via-selenium\n",
    "    options.add_argument('--auto-open-devtools-for-tabs')\n",
    "    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)\n",
    "    try:\n",
    "        driver.get('https://odysseycourtinformation.saginawcounty.com/Portal/Home/Dashboard/26')\n",
    "        driver.maximize_window()\n",
    "        ##https://stackoverflow.com/questions/71849933/attributeerror-webelement-object-has-no-attribute-select-by-value-selecting\n",
    "        search_by = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, \"//select[@id='cboHSSearchBy']\")))\n",
    "        Select(search_by).select_by_value('CaseNumber')\n",
    "        driver.find_element(By.ID, 'SearchCriteria_SearchValue').send_keys(case)\n",
    "        driver.find_element(By.ID, 'SearchCriteria_DateFrom').send_keys(date_from)\n",
    "        driver.find_element(By.ID, 'SearchCriteria_DateTo').send_keys(date_to)\n",
    "        driver.find_element(By.ID, 'btnHSSubmit').click()\n",
    "        time.sleep(1) # always be kind to servers -- wait a bit between requests\n",
    "\n",
    "        # Only the first View button is tried.\n",
    "        view_buttons = driver.find_elements(By.XPATH, \"//button[text()='View']\")\n",
    "        if view_buttons:\n",
    "            try:\n",
    "                view_buttons[0].click()\n",
    "            except WebDriverException:\n",
    "                # best effort: a button that cannot be clicked is skipped and the link scan below still runs\n",
    "                pass\n",
    "\n",
    "        # The first anchor whose markup mentions CaseNumber= is taken as the ROA link.\n",
    "        for link in driver.find_elements(By.XPATH, '//a[@href]'):\n",
    "            if 'CaseNumber=' in link.get_attribute('outerHTML'):\n",
    "                g = link.get_attribute('href')\n",
    "                break\n",
    "\n",
    "        if g != 'empty':\n",
    "            driver.get(g)\n",
    "            page = driver.execute_script('return document.documentElement.innerHTML;')\n",
    "            outfile = OUT / (case + '.html')\n",
    "            with open(outfile, 'w', encoding='utf-8') as f:\n",
    "                f.write(page)\n",
    "        time.sleep(1) # always be kind to servers -- wait a bit between requests\n",
    "    finally:\n",
    "        # close the browser even when a step above raises, so failed cases do not leave Chrome windows behind\n",
    "        driver.quit()\n",
    "\n",
    "    # All four lists are extended together, only after the case finished, so they stay the same length.\n",
    "    id_.append(new)\n",
    "    case_id.append(case)\n",
    "    url.append(g)\n",
    "    remain.append(left)\n",
    "    #For monitoring in real time\n",
    "    print(new)\n",
    "    print(case)\n",
    "    print(left)\n",
    "\n",
    "################Performing the Program to Webscrape ROAs\n",
    "try:\n",
    "    for querycase in whole:\n",
    "        gotopage(querycase)\n",
    "finally:\n",
    "    # Runs after a failed case too, so the log of the cases already scraped is not lost.\n",
    "    # The lists can differ in length if gotopage stopped part-way; only the rows present in all four are kept.\n",
    "    done = min(len(id_), len(case_id), len(url), len(remain))\n",
    "\n",
    "    ################Saving into a dataframe\n",
    "    the_news = pd.DataFrame({\"Order\": id_[:done], \"Case\": case_id[:done], \"ROA\": url[:done], \"Left\": remain[:done]})\n",
    "\n",
    "    ################Exporting the dataframe into csv file for analysis in STATA\n",
    "    # writing to csv file https://www.freecodecamp.org/news/dataframe-to-csv-how-to-save-pandas-dataframes-by-exporting/\n",
    "    the_news.to_csv(case_info, sep=',', index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2d2fc81",
   "metadata": {},
   "outputs": [],
   "source": [
    "################################################################################################################\n",
    "##Parsing the ROAs\n",
    "################################################################################################################"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0eb5543",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "################Creating Program to Parse the ROAs\n",
    "def sag_roa_parser(file):\n",
    "\t'''Parse one saved ROA html page into a single-row DataFrame.\n",
    "\n",
    "\tReads source/file and pulls the case header, party, charge, bond,\n",
    "\tdisposition and event fields out with XPath. Repeating groups get numbered\n",
    "\tcolumns (c_charge_num1, c_charge_num2, ...). Parsing is best effort: on the\n",
    "\tfirst failure the file name and the error are printed and the fields\n",
    "\tcollected so far are returned.\n",
    "\n",
    "\tParameters\n",
    "\t----------\n",
    "\tfile : str\n",
    "\t\tFile name (not a full path) of a saved page inside the source directory.\n",
    "\n",
    "\tReturns\n",
    "\t-------\n",
    "\tpandas.DataFrame\n",
    "\t\tOne row; it has no columns when nothing could be parsed.\n",
    "\t'''\n",
    "\trecords = {}\n",
    "\ttry:\n",
    "\t\t# the scraper writes these pages as utf-8, so read them back the same way\n",
    "\t\twith open(source/file, \"r\", encoding='utf-8') as f:\n",
    "\t\t\tpage = f.read()\n",
    "\t\ttree = etree.HTML(page)\n",
    "\t\t#case level info\n",
    "\t\t# str.strip('.html') removes a set of characters, not a suffix, and would also eat leading/trailing h, t, m, l or dots of the id\n",
    "\t\trecords[\"case_id\"] = file[:-len('.html')] if file.endswith('.html') else file\n",
    "\t\t# every header value is read by position from the same list of text nodes\n",
    "\t\t#https://stackoverflow.com/questions/29555452/lxml-doesnt-get-all-text-in-element-if-text-has-br\n",
    "\t\theader = tree.xpath('//div[@class=\"col-md-4\"]/p//text()')\n",
    "\t\tcase = header[2].strip('\\n ')\n",
    "\t\trecords[\"roa_case_id\"] = case\n",
    "\t\t#Court Information\n",
    "\t\tfor key, pos in ((\"court\", 5), (\"judge_name\", 8), (\"date_filed\", 11), (\"case_type\", 14), (\"case_status\", 17)):\n",
    "\t\t\trecords[key] = header[pos].strip('\\n ')\n",
    "\t\t# drop the case number from the ends of the title as a whole string (str.strip(case) would remove any of its characters)\n",
    "\t\ttitle = tree.xpath('//div[@class=\"col-md-12\"]/p//text()')[0].strip('\\n ')\n",
    "\t\tif case and title.startswith(case):\n",
    "\t\t\ttitle = title[len(case):]\n",
    "\t\tif case and title.endswith(case):\n",
    "\t\t\ttitle = title[:-len(case)]\n",
    "\t\trecords[\"entitlement\"] = title.strip(\"| \")\n",
    "\t\t#Party\n",
    "\t\trecords[\"defendent\"] = tree.xpath('//div[@id=\"divPartyInformation_body\"]//div[@class=\"col-md-8\"]/p//text()')[3].strip('\\n ')\n",
    "\t\t# attorney fields are optional: an empty string is stored when the page has none\n",
    "\t\tatty_name = tree.xpath('//div[@id=\"divPartyInformation_body\"]//div[@class=\"col-md-4\"]/div[@class=\"tyler-toggle-container\"]/div/div/div/div//text()')\n",
    "\t\trecords[\"attorney_name\"] = atty_name[0].strip('\\n ') if atty_name else \"\"\n",
    "\t\tatty_type = tree.xpath('//div[@id=\"divPartyInformation_body\"]//div[@class=\"col-md-4\"]/div[@class=\"tyler-toggle-container\"]/div/p//text()')\n",
    "\t\trecords[\"attorney_type\"] = atty_type[0].strip('\\n ') if atty_type else \"\"\n",
    "\t\t#Charges #relevant for criminal cases; each value is the list of text nodes of that table cell\n",
    "\t\t##https://stackoverflow.com/questions/2404130/xpath-to-find-elements-that-does-not-have-an-id-or-class\n",
    "\t\tcharge_grp = tree.xpath('//div[@id=\"chargeInformationDiv\"]/div/div/div/table/tbody/tr[not(@style)]')\n",
    "\t\tfor c, charge in enumerate(charge_grp, start=1):\n",
    "\t\t\trecords[\"c_charge_num{0}\".format(c)] = charge.xpath('./td[2]//text()')\n",
    "\t\t\trecords[\"c_current_charge{0}\".format(c)] = charge.xpath('./td[3]//text()')\n",
    "\t\t\trecords[\"c_charge_stat{0}\".format(c)] = charge.xpath('./td[4]//text()')\n",
    "\t\t\trecords[\"c_charge_level{0}\".format(c)] = charge.xpath('./td[5]//text()')\n",
    "\t\t\trecords[\"c_charge_date{0}\".format(c)] = charge.xpath('./td[6]//text()')\n",
    "\t\t#Bond Settings: one column per text node of the grid\n",
    "\t\tbond_setdate = tree.xpath('//div[@id=\"BondSettingsGrid\"]/table/tbody//text()')\n",
    "\t\tfor bs, bonds in enumerate(bond_setdate, start=1):\n",
    "\t\t\trecords[\"bs_bond_setdate{0}\".format(bs)] = bonds\n",
    "\t\t#Bond Details\n",
    "\t\t# the path must not end in '/': lxml rejects that as an invalid expression, which previously left this section unparsed for every file\n",
    "\t\tbond_det = tree.xpath('//div[@id=\"BondsGrid\"]/table/tbody/tr')\n",
    "\t\tfor b, bond in enumerate(bond_det, start=1):\n",
    "\t\t\trecords[\"b_bond_type{0}\".format(b)] = bond.xpath('./td[1]//text()')\n",
    "\t\t\trecords[\"b_bond_num{0}\".format(b)] = bond.xpath('./td[2]//text()')\n",
    "\t\t\trecords[\"b_bond_amnt{0}\".format(b)] = bond.xpath('./td[3]//text()')\n",
    "\t\t\trecords[\"b_bond_stat{0}\".format(b)] = bond.xpath('./td[4]//text()')\n",
    "\t\t#Disposition Events: each value is a one-element list holding the text-node list rendered as a string\n",
    "\t\tdisp_event = tree.xpath('//div[@id=\"dispositionInformationDiv\"]/div[@class=\"row-buff\"]/div[@class=\"row-buff\"]')\n",
    "\t\tfor d, disp in enumerate(disp_event, start=1):\n",
    "\t\t\trecords[\"d_disp_time_event{0}\".format(d)] = [str(disp.xpath('./div[@class=\"tyler-toggle-controller open\"]/p[@class=\"text-primary\"][1]//text()'))]\n",
    "\t\t\trecords[\"d_disp_charge_action{0}\".format(d)] = [str(disp.xpath('./div[@class=\"tyler-toggle-container row-buff\"]//text()'))]\n",
    "\t\t#events and hearings: same value format as the disposition events\n",
    "\t\tevent_grp = tree.xpath('//div[@id=\"eventsInformationDiv\"]/ul[@class=\"list-group\"]/li[@class=\"list-group-item\"]/div[@class=\"portal-case-event\"]')\n",
    "\t\tfor e, eve in enumerate(event_grp, start=1):\n",
    "\t\t\trecords[\"e_event_date{0}\".format(e)] = [str(eve.xpath('./div/p[@class=\"text-primary\"]//text()'))]\n",
    "\t\t\trecords[\"e_event_comment{0}\".format(e)] = [str(eve.xpath('./div[@class=\"tyler-toggle-container row-buff\"]//text()'))]\n",
    "\texcept Exception as err:\n",
    "\t\t# best effort: report which file failed and why, then return what was collected (Ctrl-C is no longer swallowed)\n",
    "\t\tprint(file, repr(err))\n",
    "\treturn pd.DataFrame([records], columns=records.keys())\n",
    "\n",
    "################Performing Program to Parse the ROAs\n",
    "##pulling the file names of the saved ROAs\n",
    "files = os.listdir(source)\n",
    "\n",
    "print(files)\n",
    "\n",
    "##The first entry is empty (original author's note); it is still parsed like every other file\n",
    "##each file gives a one-row frame; they are stacked with a single concat instead of re-copying all earlier rows on every pass\n",
    "data = pd.concat([sag_roa_parser(f) for f in files])\n",
    "\n",
    "################Saving dataframe into excel spreadsheets for analysis in STATA\n",
    "data.to_excel(Monitor/'roasparsed_precrim.xlsx', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05e1c83f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c547f1d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
